In [2]:
# FLEXIBILITY and PREDICTION ERROR. REGRESSION and SPLINES. CROSS-VALIDATION and TUNING.
# Dr. M. Baron, Statistical Machine Learning class, STAT-427/627

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.formula.api import ols
from scipy.interpolate import LSQUnivariateSpline
from sklearn.model_selection import train_test_split
In [56]:
# Set the working directory and load the data
os.chdir("C:\\Users\\baron\\Documents\\Teach\\627 Statistical Machine Learning\\Data")  # Change the working directory
data = pd.read_csv("Auto.csv")

# Sort data by weight, because our spline tools require the X variable to be strictly increasing
n = len(data)
data['weight'] = data['weight'] + np.random.normal(0, 0.001, n)   # Jitter to break ties, because ties...
data = data.sort_values(by='weight')                              # ...would prevent a strictly increasing order
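# Sanity check (illustrative): after jittering and sorting, the weights must be
# strictly increasing, which is exactly what LSQUnivariateSpline requires.
assert np.all(np.diff(data['weight'].values) > 0), "weights are not strictly increasing"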
In [58]:
# Fit a REGRESSION model predicting miles per gallon based on weight.
# Plot the regression line in red color with thickness=3.

reg = ols('mpg ~ weight', data=data).fit()
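# For a quick look at the fitted model (standard statsmodels attributes;
# a full table is available via reg.summary()):
print(reg.params)      # estimated intercept and slope for weight
print(reg.rsquared)    # share of mpg variance explained by weight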
In [60]:
plt.figure()
plt.scatter(data['weight'], data['mpg'], label='Data', s=15)
plt.plot(data['weight'], reg.predict(data['weight']), color='red', linewidth=3, label='Regression Line')
plt.xlabel('Automobile weight'); plt.ylabel('Miles per gallon'); plt.title('Linear regression line'); plt.legend(); plt.show()
[Figure: "Linear regression line": mpg vs. automobile weight scatter with the fitted line in red]
In [62]:
# Define a SPLINE with some degrees of freedom and plot it in some color.
# Details on splines in Python: https://docs.scipy.org/doc/scipy/tutorial/interpolate/smoothing_splines.html

def plot_spline(data, df, color, label, lw=2):
    # Generate interior knot positions: each added degree of freedom adds one knot
    # (for a degree-k spline, the true number of parameters is n_knots + k + 1)
    n_knots = df - 1
    knots = np.linspace(min(data['weight']), max(data['weight']), n_knots + 2)[1:-1]  # exclude boundary knots
    spline = LSQUnivariateSpline(data['weight'], data['mpg'], knots, k=3)
    plt.plot(data['weight'], spline(data['weight']), color=color, linewidth=lw, label=label)
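# Illustrative check of the bookkeeping above: an LSQUnivariateSpline with m interior
# knots and degree k is built from m + k + 1 basis coefficients (get_coeffs()).
example_knots = np.linspace(min(data['weight']), max(data['weight']), 6)[1:-1]   # 4 interior knots
example_spline = LSQUnivariateSpline(data['weight'], data['mpg'], example_knots, k=3)
print(len(example_spline.get_coeffs()))   # 4 + 3 + 1 = 8 coefficients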
In [100]:
# Fit and plot splines with different degrees of freedom, or levels of flexibility.

dfs = [1, 2, 5, 10, 45]
colors = ['cyan', 'magenta', 'blue', 'green', 'orange']

plt.figure(figsize=(10,6))
plt.scatter(data['weight'], data['mpg'], label='Data', s=10)
plt.plot(data['weight'], reg.predict(data['weight']), color='red', lw=2, label='Regression Line')

for df, color in zip(dfs, colors):
    plot_spline(data, df, color, label=f'Spline with df={df}')

plt.xlabel('Automobile weight'); plt.ylabel('Miles per gallon'); plt.title('Fitted splines with different degrees of freedom')
plt.legend(); plt.show()
[Figure: "Fitted splines with different degrees of freedom": mpg vs. weight with the regression line and splines for df = 1, 2, 5, 10, 45]
In [102]:
# Splines with too many degrees of freedom clearly overfit the data, chasing individual points,
# so they will predict poorly. Linear regression, at the other extreme, is too rigid, with too few d.f.

# TUNING: How to choose the best degrees of freedom? Let's minimize the mean-squared error!

MSE = []                  # We'll store the mean-squared error here

degrees_of_freedom = range(1, 45)
for df in degrees_of_freedom:
    n_knots = df - 1
    knots = np.linspace(min(data['weight']), max(data['weight']), n_knots + 2)[1:-1]
    spline = LSQUnivariateSpline(data['weight'], data['mpg'], knots, k=3)
    Yhat = spline(data['weight'])
    MSE.append(np.mean((Yhat - data['mpg']) ** 2))

plt.plot(degrees_of_freedom, MSE)
plt.xlabel('Degrees of Freedom'); plt.ylabel('Mean-Squared Error'); plt.show()
[Figure: training mean-squared error vs. degrees of freedom]
In [104]:
# This is an INTERNAL or TRAINING mean-squared error, within the given data. It improves with more d.f.
# A fair criterion is an EXTERNAL or PREDICTION mean-squared error. 
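In symbols (standard definitions), with fitted curve $\hat f$: the training MSE is computed on the same $n$ observations used for fitting,

$$\mathrm{MSE}_{\text{train}} = \frac{1}{n}\sum_{i=1}^{n}\left(y_i - \hat f(x_i)\right)^2,$$

while the prediction MSE is computed on $m$ held-out observations $(x_j^*, y_j^*)$ that played no role in the fit:

$$\mathrm{MSE}_{\text{test}} = \frac{1}{m}\sum_{j=1}^{m}\left(y_j^* - \hat f(x_j^*)\right)^2.$$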
In [229]:
# Cross-validation technique to choose the optimal method.
# We randomly split the data into TRAINING and TESTING subsamples

n = len(data)
Z = np.random.choice(n, n//2, replace=False)            # row positions of the training half
train_data = data.iloc[Z]
train_data = train_data.sort_values(by='weight')        # splines need strictly increasing X
test_data  = data.iloc[np.setdiff1d(np.arange(n), Z)]   # complement of Z (note: iloc[-Z] would NOT exclude rows Z)
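# The same split can be done with sklearn's train_test_split, imported at the top;
# test_size=0.5 mirrors the n//2 split above, and random_state is an arbitrary seed.
train_data2, test_data2 = train_test_split(data, test_size=0.5, random_state=1)
train_data2 = train_data2.sort_values(by='weight')      # splines still need increasing X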
In [231]:
CV_MSE = []                            # We'll store prediction mean-squared error here

for df in degrees_of_freedom:
    n_knots = df - 1
    knots = np.linspace(min(train_data['weight']), max(train_data['weight']), n_knots + 2)[1:-1]  
    spline = LSQUnivariateSpline(train_data['weight'], train_data['mpg'], knots, k=3)
    Yhat = spline(test_data['weight'])
    CV_MSE.append(np.mean((Yhat - test_data['mpg']) ** 2))
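# A single random split gives a noisy MSE curve. As a sketch, one can average the
# prediction MSE over several random splits (Monte Carlo cross-validation); the
# number of repeats and the seed below are illustrative choices. (At very high df,
# a split may leave too few training points between knots and raise an error.)
rng = np.random.default_rng(0)
n_repeats = 10
avg_CV_MSE = np.zeros(len(degrees_of_freedom))
for _ in range(n_repeats):
    idx = rng.choice(n, n//2, replace=False)
    tr = data.iloc[idx].sort_values(by='weight')
    te = data.iloc[np.setdiff1d(np.arange(n), idx)]
    for i, df in enumerate(degrees_of_freedom):
        knots = np.linspace(min(tr['weight']), max(tr['weight']), df + 1)[1:-1]
        s = LSQUnivariateSpline(tr['weight'], tr['mpg'], knots, k=3)
        avg_CV_MSE[i] += np.mean((s(te['weight']) - te['mpg']) ** 2) / n_repeats
print(degrees_of_freedom[np.argmin(avg_CV_MSE)])   # df minimizing the averaged MSE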
In [233]:
plt.plot(degrees_of_freedom, CV_MSE)
plt.xlabel('Degrees of Freedom'); plt.ylabel('Prediction Mean-Squared Error'); plt.show()
[Figure: prediction mean-squared error vs. degrees of freedom]
In [239]:
# Restrict the Y axis to zoom in on the minimum
plt.plot(degrees_of_freedom, CV_MSE); plt.ylim(1, 40)
plt.xlabel('Degrees of Freedom'); plt.ylabel('Prediction Mean-Squared Error'); plt.show()
[Figure: prediction mean-squared error vs. degrees of freedom, Y axis restricted to (1, 40)]
In [241]:
# Find the optimal degrees of freedom for this machine learning problem
optimal_df = degrees_of_freedom[np.argmin(CV_MSE)]
print(f"Optimal degrees of freedom: {optimal_df}")
Optimal degrees of freedom: 9
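# As a final sketch, refit the spline at the selected flexibility on the full
# (sorted) data and plot it alongside the observations.
knots = np.linspace(min(data['weight']), max(data['weight']), optimal_df + 1)[1:-1]
best_spline = LSQUnivariateSpline(data['weight'], data['mpg'], knots, k=3)

plt.figure(figsize=(10, 6))
plt.scatter(data['weight'], data['mpg'], label='Data', s=10)
plt.plot(data['weight'], best_spline(data['weight']), color='green', lw=3,
         label=f'Spline with df={optimal_df}')
plt.xlabel('Automobile weight'); plt.ylabel('Miles per gallon')
plt.title('Spline with cross-validated degrees of freedom'); plt.legend(); plt.show()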